# load up modules
import pandas as pd
import numpy as np
# load for visuals
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# set up notebook to display multiple output in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# read in file from module 5
df = pd.read_csv('beer_reviews_final.csv')
# what is the shape of the data
df.shape
# look at first five records
df.head()
# check for nulls and for data types
# if you did your assignment correctly for module 5, there should be no nulls
df.info()
# let's check out our date info
yearmonth = df['review_date'].str[0:7]
# We have all of 2011 and part of January 2012
yearmonth.value_counts().sort_index()
# TODO show the shape of your data
df.shape
# TODO only keep 2011 data for analysis
range = (df['review_date'] > '2010-12-31') & (df['review_date'] <= '2011-12-32')
df=df.loc[range]
df.head()
# TODO show the new shape of the data
df.shape
# TODO show your dataframe contains 2011 data with monthly counts
yearmonth2 = df['review_date'].str[0:7]
yearmonth2.value_counts().sort_index()
In examination of beer_abv using a boxplot, there appears to be outliers beyond 15 percent alcohol.
What are reasonable alcohol levels for beer? According to the link provided, average alcohol levels for beer are around the 5 percent level. https://www.alcohol.org/statistics-information/abv/
There are a lot of values over 15, so next we'll take a closer look at them. Note that it is possible to specify the percentiles within describe( ).
df['beer_abv'].describe(percentiles = [.25, .5, .75, .95])
df.boxplot(column = 'beer_abv')
Now we can isolate the beers with alcohol content over 15% and take a closer look.
# isolate beers with alcohol content over 15
x = df[df['beer_abv'] > 15]
# how many reviews are there? 2513
len(x)
# how many unique beers is that? 94
x['beer_name'].unique().shape
# do they look like valid abv values? Or are they mislabeled?
x
#TODO show the shape of your data
df.shape
#TODO Drop all beers with an alcohol content under 1 percent and over 11.5 percent
df = df[ (df['beer_abv'] < 11.5) & (df['beer_abv'] > 1 )]
df.head()
#TODO show the shape of your data
df.shape
#TODO display another boxplot to show beer_abv
df.boxplot(column = 'beer_abv')
# now look at how the change to the data affected the description stats
df['beer_abv'].describe(percentiles = [.25, .5, .75, .95])
Since we're only interested in the data from 2011, we can extract just the month from our data and create a new column for this.
# create a new column for month
df['month'] = df['review_date'].str[5:7]
df['month'].value_counts()
One of the questions from our client is about seasonal beers. With this in mind, we'll create a dictionary called seasons that we will use later in our analysis. We will use our month variable as the key. We will do more with dictionaries next week, but until then here is some more information on them: dictionaries
Note that map( ) is a built-in function that is an iteration tool. More information can be found here: built-in functions
# create a dictionary called 'seasons' using the 'month' variable
seasons = {'01' : 'Winter', '12' : 'Winter', '02' : 'Winter',
'03' : 'Spring', '04' : 'Spring', '05' : 'Spring',
'06' : 'Summer', '07' : 'Summer', '08' : 'Summer',
'09' : 'Fall', '10' : 'Fall', '11' : 'Fall'}
df['season'] = df['month'].map(seasons)
df.sample(5)
We will create a correlation matrix and Seaborn heatmap to investigate how rating, taste, and alcohol level are related. More information on creating a heatmap using Seaborn can be found here: Seaborn heatmap
What does the correlation matrix below tell us? It tells us that there is a high positive correlation between the overall review score and the taste score which makes sense since good tasting beer should get a high overall rating. The red colored boxes in the heat map indicate slight negative relationships betwen the scores shown and the beer alcohol level. This is an interesting finding because it suggests that the amount of alcohol in the beer doesn't matter that much in terms of the ratings.
Notice that the correlation matrix and the heatmap are two different ways to present the same data.
# setting the columns to correlate
columns = ['review_overall','review_taste', 'beer_abv']
df_corr = df[columns]
# running the correlation
df_corr.corr()
# setting up the heatmap
corrmat = df_corr.corr()
# set the figure size
f, ax = plt.subplots(figsize=(9, 6))
# pass the data and set the parameters
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, cmap='RdYlBu', linewidths=.5 )
plt.title('Heatmap Beer Review ratings')
# images can be saved - default is .png
# https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.savefig.html
plt.savefig('Correlation Heat Map Beer Reviews')
# TODO create a data matrix using all six numeric variables
columns = ['review_overall','review_taste', 'beer_abv','review_aroma','review_appearance','review_palate']
df_corr = df[columns]
df_corr.corr()
# TODO create a heat map using all six numeric variables. Pick a new color combination.
# https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html
corrmat = df_corr.corr()
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, cmap='cool', linewidths=.5 )
plt.title('Heatmap Beer Review ratings')
plt.savefig('Correlation Heat Map Beer Reviews')
#TODO explain how the visual cues of the heatmap represent the correlactions.
print("A color closer to pink represents higher correlation, while teel represents low correlation.")
The review_overall score was highly correlated enough with all of the other review scores that we will use just the review_overall score for our analysis on top beers.
Let's look at top beers three different ways: by brewery, by style, and by individual beer.
# look at mean of overall review and the number of reviews
brewery =df['review_overall'].groupby(df['brewery_name']).agg(['mean','count'])
# Notice that looking at just the mean is misleading as those that have a rating 5.0 have only 1
# or 2 reviews
# we could eliminate the low count, or instead focus on the high count
#
brewery.sort_values(by=['count'], ascending = False)[:20]
brewery.sort_values(by=['mean'], ascending = False)[:10]
# repeat for beer style
beerStyle =df['review_overall'].groupby(df['beer_style']).agg(['mean','count'])
beerStyle.sort_values(by=['count'], ascending = False)[:20]
# repeat for beer name
beers =df['review_overall'].groupby(df['beer_name']).agg(['mean','count'])
beers.sort_values(by=['count'], ascending = False)[:20]
# a table heat map can help point out top values
z = brewery.sort_values(by=['count'], ascending = False)[:50]
z.style.background_gradient(cmap = 'Blues')
# what does this heatmap help us see?
# top breweries to investigate based on review_overall
# Russian River Brewing Company
# Brasserie Cantillian
# Founders Brewing Company
# Surly Brewing Company
# TODO create a heatmap for beer styles
a= beerStyle.sort_values(by=['count'], ascending = False)[:50]
a.style.background_gradient(cmap = 'Reds')
# TODO list top three beer styles based on mean rating
style_mean= beerStyle.sort_values(by=['mean'], ascending = False)[:3]
style_mean
# TODO create a heatmap for individual beers
b= beers.sort_values(by=['count'], ascending = False)[:50]
b.style.background_gradient(cmap = 'Greens')
# TODO list top three beers based on mean rating
beer_mean= beers.sort_values(by=['mean'], ascending = False)[:3]
beer_mean
There are plenty of ways to slice and dice data. A heatmap is a nice visual, but there are other ways to analyse the data. Below is an example of an easy way to change the count number to see if lower count reviews have a higher review score.
Try changing the comparison value of 600 to 400 and see how the results change.
# set the comparison value to more than 600
temp = beers[beers['count'] > 600]
temp['mean'].nlargest(5)
# change the comparison value of 600 above to 400 and see how the results change
Next we will look at our total number of beer reviews by day. Note that when data is stored in a csv file, it does not retain the date field type; the review_date in this module was read in as an Object - which is the default.
There are a few ways to handle date fields. If you know you are reading in a date field from a csv file, you can specify so in the read_csv command: df = pd.read_csv('beer6.csv',parse_dates = ['review_date'])
Or you can convert a date in an Object field into a Date field, which is shown below.
# convert review_date to a date format
df['review_date'] = pd.to_datetime(df['review_date'])
df.info()
# to plot by date, we need one sum for each date
# lets group by date and create a df that we can plot
df_date = pd.DataFrame(df['review_overall'].groupby(df['review_date']).count())
df_date.sample(5)
# the date is the index and it needs to be reset so it can be used as a regular column
df_date = df_date.reset_index()
df_date.info()
Shown below are plots of the beer review counts for each day. The top graph is using Matplotlib and the bottom graph is using Plotly. Here is more information on those:
Both plot styles have plenty of features that can be customized and you are encouraged to experiement with the customizations. We will be updating the plot title along with the labels for the x and y axis.
# matplotlib version of plot
fig, ax = plt.subplots(figsize = (8,8))
ax.plot(df_date['review_date'], df_date['review_overall'])
ax.set(xlabel = 'Review Date')
# plotly version of plot - notice the info on hover
fig = px.line(df_date, x = 'review_date', y = 'review_overall',
title='Beer Review Count by Date')
fig.update_layout(height = 600, xaxis_title = 'Review Date')
# hover over Feb 13 to see counts
# TODO create your dataframe that groups the review_overall count by month
df_month = pd.DataFrame(df['review_overall'].groupby(df['month']).count())
df_month
df_month = df_month.reset_index()
df_month.info()
# TODO create a plot with Month on the x axis and counts on the y axis; Include a title, x and y axis label.
fig, ax = plt.subplots(figsize = (8,8))
ax.plot(df_month['month'], df_month['review_overall'])
ax.set(xlabel = 'month')
We already created a variable so that each review has a value of either Summer, Sprint, Winter or Fall. We want to know if there are beers that have a high number of ratings in one season which suggests they are a special beer with seasonal release.
df50 = pd.DataFrame(pd.crosstab(df['beer_name'],df['season']))
# get a total count of reviews per beer
df50['Total'] = df50['Fall'] + df50['Spring'] + df50['Summer'] + df50['Winter']
df50.head(10)
# We don't want beers with few reviews, so only keep beers with 50 or more reviews
df50 = df50[df50['Total'] >= 50]
df50 = df50.reset_index()
df50.head()
# lets caculate percentages of total for each season
df50['fallPercent'] = (df50['Fall']/df50['Total']) * 100
df50['springPercent'] = (df50['Spring']/df50['Total']) * 100
df50['summerPercent'] = (df50['Summer']/df50['Total']) * 100
df50['winterPercent'] = (df50['Winter']/df50['Total']) * 100
df50.info()
df50.sample(5)
# let's look at Spring to see if any beers have the majority of reviews in Spring
df50[df50['springPercent'] > 75]
# TODO Show the beer review counts for each season with over 75 percent per season
# TODO Which beer(s) would you suggest that the client to look at in regards to a seasonal beer and why?